{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook reads and merges .csv files for each folder in 'original_dataset_path'\n",
    "\n",
    "Each merged .csv is saved in a folder in 'processed_data_path'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import glob\n",
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _infer_separator(csv_path):\n",
    "    \"\"\"Return the field delimiter of 'csv_path', sniffed from its first line.\"\"\"\n",
    "    # csv.Sniffer is the tool pandas documents for sep=None; using it directly avoids\n",
    "    # the private 'reader._engine' attribute and makes sure the file gets closed\n",
    "    with open(csv_path, newline='', encoding='utf-8') as csv_file:\n",
    "        first_line = csv_file.readline()\n",
    "    return csv.Sniffer().sniff(first_line).delimiter\n",
    "\n",
    "\n",
    "def csvs_merge(path_in, path_out, types_infos):\n",
    "    \"\"\"\n",
    "    Merge all .csv files of each 'file_type' found in 'path_in'.\n",
    "\n",
    "    One merged file per type is saved as 'path_out'/'file_type'.csv\n",
    "    (e.g. data/processed_data/'dataset'/'folder'/acc.csv).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path_in : str\n",
    "        Folder containing the '<file_type>*.csv' files to merge.\n",
    "    path_out : str\n",
    "        Folder where the merged files are written (created if missing).\n",
    "    types_infos : dict\n",
    "        Maps each file type identifier (file name prefix) to a dict with\n",
    "        'usecols' (column positions to read) and 'names' (header to assign).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    csv.Error\n",
    "        If the separator cannot be determined from the first file of a type.\n",
    "    \"\"\"\n",
    "\n",
    "    for file_type, type_infos in types_infos.items():\n",
    "\n",
    "        # Getting all .csv files of type 'file_type', ordered by file name.\n",
    "        # 'path_in' is escaped so that glob only expands the file name part.\n",
    "        pattern = os.path.join(glob.escape(path_in), '%s*.csv' % file_type)\n",
    "        csv_files = sorted(glob.glob(pattern))\n",
    "\n",
    "        # Some folders don't have all types\n",
    "        if not csv_files:\n",
    "            continue\n",
    "\n",
    "        # Determining separator type with the first .csv file\n",
    "        # (every file of this type in the folder is then read with that separator)\n",
    "        inferred_sep = _infer_separator(csv_files[0])\n",
    "\n",
    "        # Merging .csv files of type 'file_type'\n",
    "        merged_csv = pd.concat([pd.read_csv(csv_file,\n",
    "                                            sep=inferred_sep,\n",
    "                                            usecols=type_infos['usecols'],\n",
    "                                            names=type_infos['names'],\n",
    "                                            header=None, engine='c')\n",
    "                                for csv_file in csv_files])\n",
    "\n",
    "        # to_csv does not create missing folders\n",
    "        os.makedirs(path_out, exist_ok=True)\n",
    "        merged_csv.to_csv(os.path.join(path_out, '%s.csv' % file_type),\n",
    "                          index=False, encoding='utf-8-sig')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook expects the following directory structure, but you can change it with the\n",
    "'original_dataset_path' and 'processed_data_path' variables below.\n",
    "\n",
    "#### Create the folders* indicated below\n",
    "#### After downloading FEMTOBearingDataSet.zip from the [NASA repository](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/), you can just extract 'Training_set.zip' and 'Validation_set.zip' to get all the data"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "├── Bearing-RUL-Predict\n",
    "    ├── data                       <-- *Create this folder\n",
    "        ├── original_data          <-- *Create this folder\n",
    "        │   ├── femto_dataset      <-- *Create this folder and copy femto dataset BearingX_Y folders here\n",
    "        |   |   ├── Bearing1_1                                         \n",
    "        |   |   |   ├── acc_xxxxx.csv      \n",
    "        |   |   |   ├── ...\n",
    "        |   |   |   ├── temp_xxxxx.csv\n",
    "        |   |   ├── Bearing1_2\n",
    "        |   |   |   ├── acc_xxxxx.csv     \n",
    "        |   |   |   ├── ...\n",
    "        |   |   |   ├── temp_xxxxx.csv\n",
    "        |   |   ├── ...\n",
    "        │   └── ...\n",
    "        └── processed_data         <-- *Create this folder\n",
    "                                        All processed data will be stored in its subfolders (BearingX_Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Defining paths, file type info and merging .csvs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "original_dataset_path = 'data/original_data/femto_dataset'\n",
    "processed_data_path   = 'data/processed_data/femto_dataset'\n",
    "\n",
    "types_infos = {\n",
    "    # file type identifier, columns to read, columns name\n",
    "    # μs column is not read (position 3 is left out of 'usecols')\n",
    "    'acc'  : {'usecols' : [0, 1, 2, 4, 5], 'names' : ['hour', 'min', 'seg', 'h', 'v']},\n",
    "    'temp' : {'usecols' : [0, 1, 2, 4],    'names' : ['hour', 'min', 'seg', 'temp']}\n",
    "}\n",
    "\n",
    "# Merging the .csv files of every folder found in 'original_dataset_path'.\n",
    "# Each folder gets a subfolder with the same name in 'processed_data_path'.\n",
    "for folder in sorted(os.listdir(original_dataset_path)):\n",
    "    path_in = os.path.join(original_dataset_path, folder)\n",
    "\n",
    "    # Skipping anything that is not a folder (e.g. loose files next to the data)\n",
    "    if not os.path.isdir(path_in):\n",
    "        continue\n",
    "\n",
    "    # The output folder must exist before the merged files are written\n",
    "    path_out = os.path.join(processed_data_path, folder)\n",
    "    os.makedirs(path_out, exist_ok=True)\n",
    "\n",
    "    csvs_merge(path_in, path_out, types_infos)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bearing_rul_predict",
   "language": "python",
   "name": "bearing_rul_predict"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
